# Read the admissions data set from the working directory and print
# per-column summary statistics.
test <- read.csv(file = "Admission_Predict_Ver1.1.csv")
summary(test)
##    Serial.No.      GRE.Score      TOEFL.Score    University.Rating
##  Min.   :  1.0   Min.   :290.0   Min.   : 92.0   Min.   :1.000    
##  1st Qu.:125.8   1st Qu.:308.0   1st Qu.:103.0   1st Qu.:2.000    
##  Median :250.5   Median :317.0   Median :107.0   Median :3.000    
##  Mean   :250.5   Mean   :316.5   Mean   :107.2   Mean   :3.114    
##  3rd Qu.:375.2   3rd Qu.:325.0   3rd Qu.:112.0   3rd Qu.:4.000    
##  Max.   :500.0   Max.   :340.0   Max.   :120.0   Max.   :5.000    
##       SOP             LOR             CGPA          Research   
##  Min.   :1.000   Min.   :1.000   Min.   :6.800   Min.   :0.00  
##  1st Qu.:2.500   1st Qu.:3.000   1st Qu.:8.127   1st Qu.:0.00  
##  Median :3.500   Median :3.500   Median :8.560   Median :1.00  
##  Mean   :3.374   Mean   :3.484   Mean   :8.576   Mean   :0.56  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:9.040   3rd Qu.:1.00  
##  Max.   :5.000   Max.   :5.000   Max.   :9.920   Max.   :1.00  
##  Chance.of.Admit 
##  Min.   :0.3400  
##  1st Qu.:0.6300  
##  Median :0.7200  
##  Mean   :0.7217  
##  3rd Qu.:0.8200  
##  Max.   :0.9700
# Preview the first six rows of the data.
head(test, n = 6L)
##   Serial.No. GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research
## 1          1       337         118                 4 4.5 4.5 9.65        1
## 2          2       324         107                 4 4.0 4.5 8.87        1
## 3          3       316         104                 3 3.0 3.5 8.00        1
## 4          4       322         110                 3 3.5 2.5 8.67        1
## 5          5       314         103                 2 2.0 3.0 8.21        0
## 6          6       330         115                 5 4.5 3.0 9.34        1
##   Chance.of.Admit
## 1            0.92
## 2            0.76
## 3            0.72
## 4            0.80
## 5            0.65
## 6            0.90
# NOTE(review): attach() places the columns of `test` on the search path, so
# any later statement that names a column bare (without `data =` or `test$`)
# resolves through this call. Prefer passing `data = test` explicitly in new
# code; attach() makes it easy to pick up a stale or masked variable.
attach(test)

# Linear regression and diagnostic plots ----

# Full linear model: Chance.of.Admit on every other column.
linear <- lm(Chance.of.Admit ~ ., data = test)
# summary(linear)
plot(linear)

# Full linear model: University.Rating on every other column.
linear <- lm(University.Rating ~ ., data = test)
# summary(linear)
plot(linear)

# Research is a 0/1 indicator, so fit a logistic regression. Without an
# explicit `family`, glm() falls back to gaussian (ordinary least squares).
logmod <- glm(Research ~ ., family = binomial, data = test)
# summary(logmod)
plot(logmod)

# Simple regression of Chance.of.Admit on CGPA with the fitted line overlaid.
# The formula uses bare column names plus `data =` so that predict() can be
# given `newdata` later (terms written as `test$CGPA` cannot be matched).
chance.vs.CGPA <- lm(Chance.of.Admit ~ CGPA, data = test)
# In `y ~ x` the left-hand side is drawn on the y axis: CGPA labels the x axis.
plot(
  Chance.of.Admit ~ CGPA,
  data = test,
  xlab = "CGPA",
  ylab = "Chance of Admission",
  main = "Chance of Admission VS CGPA"
)
# abline() takes the fitted model directly; it has no `data` argument.
abline(chance.vs.CGPA, col = "red", lwd = 3)

#sum ((predict(chance.vs.CGPA, data.frame(test))) - test$Chance.of.Admit)^2 /nrow(test)

Variable Selection for Chance of Admission

By performing backward selection, we remove the least significant variable at each step until all remaining variables are significant.

# Backward elimination for Chance.of.Admit. Each step drops exactly the one
# predictor named in its comment, so the formulas and the narrative agree.
# NOTE(review): the removal order is taken from the original comments;
# re-run the commented summary() calls to confirm the p-values.

# Step 0: full model with every predictor.
linear <- lm(Chance.of.Admit ~ ., data = test)
# summary(linear)

# Step 1: remove University.Rating (highest non-significant p-value).
linear <- lm(
  Chance.of.Admit ~ Serial.No. + GRE.Score + TOEFL.Score + SOP + LOR + CGPA +
    Research,
  data = test
)
# summary(linear)

# Step 2: remove SOP (second highest non-significant p-value).
linear <- lm(
  Chance.of.Admit ~ Serial.No. + GRE.Score + TOEFL.Score + LOR + CGPA +
    Research,
  data = test
)
# summary(linear)

# Step 3: remove Serial.No. (a row identifier, not a real predictor).
linear <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = test
)
# summary(linear)
# plot(test)
# linearPlot <- plot(Chance.of.Admit~.,data=test)

Variable Selection for Research

# Backward elimination for Research using lm(). Research is a 0/1 indicator,
# so these fits are linear probability models. One predictor is dropped per
# step; each comment names the predictor removed at that step.

# Step 0: all predictors except Chance.of.Admit.
linear <- lm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + SOP +
    LOR + CGPA,
  data = test
)
# summary(linear)

# Step 1: remove SOP.
linear <- lm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + LOR +
    CGPA,
  data = test
)
# summary(linear)

# Step 2: remove CGPA.
linear <- lm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating + LOR,
  data = test
)
# summary(linear)

# Step 3: remove LOR.
linear <- lm(
  Research ~ Serial.No. + GRE.Score + TOEFL.Score + University.Rating,
  data = test
)
# summary(linear)

# Step 4: remove TOEFL.Score.
linear <- lm(Research ~ Serial.No. + GRE.Score + University.Rating, data = test)
# summary(linear)

# Step 5: remove Serial.No. (the stray leading `+` in the formula is gone;
# the fitted coefficients are unaffected, only the echoed call changes).
linear <- lm(Research ~ GRE.Score + University.Rating, data = test)
summary(linear)
## 
## Call:
## lm(formula = Research ~ GRE.Score + University.Rating, data = test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14033 -0.35017  0.00906  0.29255  1.00181 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.415603   0.625451 -10.258   <2e-16 ***
## GRE.Score          0.021546   0.002099  10.266   <2e-16 ***
## University.Rating  0.050337   0.020731   2.428   0.0155 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4089 on 497 degrees of freedom
## Multiple R-squared:  0.3254, Adjusted R-squared:  0.3227 
## F-statistic: 119.9 on 2 and 497 DF,  p-value: < 2.2e-16
# Diagnostic plots for the model currently stored in `linear`.
plot(linear)

## Based on the Normal Q-Q plot, we can determine that the data fits well.
## NOTE(review): the response (Research) only takes the values 0 and 1, so
## residual normality from lm() should be interpreted with caution.

Variable Selection for University Rating

# Starting model for University.Rating: every column except Chance.of.Admit.
linear <- lm(
  formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score + SOP +
    LOR + CGPA + Research,
  data = test
)
summary(linear)
## 
## Call:
## lm(formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score + 
##     SOP + LOR + CGPA + Research, data = test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34352 -0.46556 -0.03557  0.44046  2.44809 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.4170319  1.2125399  -5.292 1.82e-07 ***
## Serial.No.   0.0001812  0.0002260   0.802  0.42307    
## GRE.Score    0.0065910  0.0059476   1.108  0.26833    
## TOEFL.Score  0.0209679  0.0103520   2.025  0.04336 *  
## SOP          0.4474027  0.0507642   8.813  < 2e-16 ***
## LOR          0.1498125  0.0488318   3.068  0.00227 ** 
## CGPA         0.3578395  0.1141124   3.136  0.00182 ** 
## Research     0.0923371  0.0783086   1.179  0.23891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7109 on 492 degrees of freedom
## Multiple R-squared:  0.619,  Adjusted R-squared:  0.6135 
## F-statistic: 114.2 on 7 and 492 DF,  p-value: < 2.2e-16
# Step 1: drop Serial.No.
linear <- lm(
  formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA +
    Research,
  data = test
)
summary(linear)
## 
## Call:
## lm(formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + 
##     LOR + CGPA + Research, data = test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36251 -0.47140 -0.04223  0.45376  2.41297 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.295220   1.202548  -5.235 2.45e-07 ***
## GRE.Score    0.006468   0.005943   1.088  0.27705    
## TOEFL.Score  0.020128   0.010295   1.955  0.05114 .  
## SOP          0.441757   0.050255   8.790  < 2e-16 ***
## LOR          0.154072   0.048524   3.175  0.00159 ** 
## CGPA         0.364222   0.113793   3.201  0.00146 ** 
## Research     0.096184   0.078133   1.231  0.21890    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7106 on 493 degrees of freedom
## Multiple R-squared:  0.6185, Adjusted R-squared:  0.6138 
## F-statistic: 133.2 on 6 and 493 DF,  p-value: < 2.2e-16
# Step 2: drop GRE.Score.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + Research,
  data = test
)
summary(linear)
## 
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + 
##     Research, data = test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37560 -0.47448 -0.03629  0.45065  2.41676 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.243653   0.715856  -7.325 9.79e-13 ***
## TOEFL.Score  0.025353   0.009109   2.783  0.00559 ** 
## SOP          0.440906   0.050259   8.773  < 2e-16 ***
## LOR          0.151540   0.048478   3.126  0.00188 ** 
## CGPA         0.414718   0.103920   3.991 7.59e-05 ***
## Research     0.120784   0.074805   1.615  0.10702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7107 on 494 degrees of freedom
## Multiple R-squared:  0.6176, Adjusted R-squared:  0.6137 
## F-statistic: 159.5 on 5 and 494 DF,  p-value: < 2.2e-16
# Step 3: drop Research; the remaining terms are TOEFL.Score, SOP, LOR, CGPA.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
  data = test
)
summary(linear)
## 
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA, 
##     data = test)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46231 -0.46269 -0.04935  0.45262  2.39211 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.62010    0.67792  -8.290 1.07e-15 ***
## TOEFL.Score  0.02695    0.00907   2.971  0.00311 ** 
## SOP          0.44423    0.05030   8.832  < 2e-16 ***
## LOR          0.15563    0.04849   3.210  0.00142 ** 
## CGPA         0.44360    0.10254   4.326 1.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7119 on 495 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6124 
## F-statistic: 198.1 on 4 and 495 DF,  p-value: < 2.2e-16

LogMod Backwards Selection Research

# Full glm() for Research on every other column.
# NOTE(review): no `family` is supplied, so this is a gaussian fit (the
# recorded output reports "gaussian family"), not a logistic regression.
# Consider `family = binomial` and re-knitting the document.
logmod <- glm(formula = Research ~ ., data = test)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ ., data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.01724  -0.33223   0.00753   0.29143   0.99776  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -4.738e+00  7.931e-01  -5.974 4.44e-09 ***
## Serial.No.         7.426e-05  1.313e-04   0.565 0.572113    
## GRE.Score          1.921e-02  3.327e-03   5.776 1.36e-08 ***
## TOEFL.Score       -8.741e-03  5.980e-03  -1.462 0.144417    
## University.Rating  2.412e-02  2.566e-02   0.940 0.347657    
## SOP                1.441e-02  3.108e-02   0.464 0.643147    
## LOR                1.404e-02  2.840e-02   0.494 0.621210    
## CGPA              -9.398e-02  7.457e-02  -1.260 0.208213    
## Chance.of.Admit    1.065e+00  3.066e-01   3.474 0.000557 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1633439)
## 
##     Null deviance: 123.200  on 499  degrees of freedom
## Residual deviance:  80.202  on 491  degrees of freedom
## AIC: 523.91
## 
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the full model.
plot(logmod)

# Step 1: drop LOR.
# NOTE(review): Chance.of.Admit, which the `.` formula of the full model
# included, is also absent from this explicit formula.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP + CGPA,
  data = test
)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + 
##     University.Rating + SOP + CGPA, data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.09472  -0.33171   0.01616   0.28395   1.02222  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.3439527  0.6572538  -9.652  < 2e-16 ***
## Serial.No.         0.0001857  0.0001291   1.438    0.151    
## GRE.Score          0.0216466  0.0032778   6.604 1.04e-10 ***
## TOEFL.Score       -0.0054357  0.0059720  -0.910    0.363    
## University.Rating  0.0344431  0.0256319   1.344    0.180    
## SOP                0.0302826  0.0298521   1.014    0.311    
## CGPA               0.0443213  0.0648898   0.683    0.495    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1670782)
## 
##     Null deviance: 123.20  on 499  degrees of freedom
## Residual deviance:  82.37  on 493  degrees of freedom
## AIC: 533.24
## 
## Number of Fisher Scoring iterations: 2
# plot(logmod)

# Step 2: drop CGPA (LOR already removed).
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP,
  data = test
)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score + 
##     University.Rating + SOP, data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.09070  -0.33673   0.01374   0.28546   1.03378  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.4208255  0.6471960  -9.921  < 2e-16 ***
## Serial.No.         0.0001936  0.0001285   1.506    0.133    
## GRE.Score          0.0225907  0.0029705   7.605 1.45e-13 ***
## TOEFL.Score       -0.0042408  0.0057070  -0.743    0.458    
## University.Rating  0.0375039  0.0252235   1.487    0.138    
## SOP                0.0358048  0.0287209   1.247    0.213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1668978)
## 
##     Null deviance: 123.200  on 499  degrees of freedom
## Residual deviance:  82.448  on 494  degrees of freedom
## AIC: 531.72
## 
## Number of Fisher Scoring iterations: 2
# Step 3: drop TOEFL.Score (LOR and CGPA already removed).
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating + SOP,
  data = test
)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating + 
##     SOP, data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.09592  -0.34393   0.00147   0.29124   1.03427  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.3837643  0.6449795  -9.898   <2e-16 ***
## Serial.No.         0.0002017  0.0001280   1.575    0.116    
## GRE.Score          0.0210992  0.0021887   9.640   <2e-16 ***
## University.Rating  0.0346801  0.0249243   1.391    0.165    
## SOP                0.0319968  0.0282472   1.133    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1667468)
## 
##     Null deviance: 123.20  on 499  degrees of freedom
## Residual deviance:  82.54  on 495  degrees of freedom
## AIC: 530.27
## 
## Number of Fisher Scoring iterations: 2
# Step 4: drop SOP (LOR, CGPA and TOEFL.Score already removed).
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = test
)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating, 
##     data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.10835  -0.34957   0.00049   0.28952   1.02269  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.5389338  0.6304444 -10.372   <2e-16 ***
## Serial.No.         0.0001855  0.0001272   1.458   0.1455    
## GRE.Score          0.0217887  0.0021030  10.361   <2e-16 ***
## University.Rating  0.0504027  0.0207077   2.434   0.0153 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.166842)
## 
##     Null deviance: 123.200  on 499  degrees of freedom
## Residual deviance:  82.754  on 496  degrees of freedom
## AIC: 529.57
## 
## Number of Fisher Scoring iterations: 2
# Step 5: drop Serial.No.; only GRE.Score and University.Rating remain.
logmod <- glm(
  formula = Research ~ GRE.Score + University.Rating,
  data = test
)
summary(logmod)
## 
## Call:
## glm(formula = Research ~ GRE.Score + University.Rating, data = test)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.14033  -0.35017   0.00906   0.29255   1.00181  
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.415603   0.625451 -10.258   <2e-16 ***
## GRE.Score          0.021546   0.002099  10.266   <2e-16 ***
## University.Rating  0.050337   0.020731   2.428   0.0155 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1672199)
## 
##     Null deviance: 123.200  on 499  degrees of freedom
## Residual deviance:  83.108  on 497  degrees of freedom
## AIC: 529.71
## 
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the model currently stored in `logmod`.
plot(logmod)

# Based on the logmod summary, the two most significant variables are University.Rating and GRE.Score. Based on these results, as the University Rating increases, so does the number of students who conduct research.

CVs

CV for linear model - Chance of Admission with CGPA - Manual Leave-One-Out

# Leave-one-out cross-validation for Chance.of.Admit ~ CGPA.
#
# For each row i the model is fitted on all other rows and scored on row i
# only. The formula uses bare column names with `data =`: when terms are
# written as `CGPA[-i]`, predict() cannot find them in `newdata`, evaluates
# them from the environment instead, and returns the fitted values of the
# training rows rather than a prediction for the held-out row.
set.seed(7861)  # kept from the original; LOOCV itself draws no random numbers

cvlm <- vector("list", nrow(test))  # fitted model for each left-out row
msecv <- numeric(nrow(test))        # squared prediction error per row
for (i in seq_len(nrow(test))) {
  # Fit the linear model without observation i
  cvlm[[i]] <- lm(Chance.of.Admit ~ CGPA, data = test[-i, , drop = FALSE])
  # Squared error of the prediction for the single held-out observation
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$Chance.of.Admit[i]
  )^2
}
# LOOCV estimate of the mean squared error
mean(msecv)
## NOTE: re-knit to refresh this output. The previously recorded value,
## [1] 0.06879746, was produced by the defective loop and is not a valid MSE.

CV for linear model - Chance of Admission - Manual Leave-One-Out

# Leave-one-out cross-validation for the multi-predictor Chance.of.Admit model.
#
# Predictors: GRE.Score, TOEFL.Score, LOR, CGPA, Research. Serial.No. is a row
# identifier and is left out, matching the "Remove Serial No." selection step.
# The formula uses bare column names with `data =` so that predict() can match
# the held-out row in `newdata`; terms such as `CGPA[-i]` cannot be matched,
# and summing the columns inside data.frame() yields one meaningless column.
set.seed(7861)  # kept from the original; LOOCV itself draws no random numbers

cvlm <- vector("list", nrow(test))  # fitted model for each left-out row
msecv <- numeric(nrow(test))        # squared prediction error per row
for (i in seq_len(nrow(test))) {
  # Fit the linear model without observation i
  cvlm[[i]] <- lm(
    Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
    data = test[-i, , drop = FALSE]
  )
  # Squared error of the prediction for the single held-out observation
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$Chance.of.Admit[i]
  )^2
}
# LOOCV estimate of the mean squared error
mean(msecv)
## NOTE: re-knit to refresh this output. The previously recorded value,
## [1] 0.0666215, was produced by the defective loop and is not a valid MSE.

Logistic regression model - Research (in-sample classification metrics)

# Observed 0/1 Research values, used as the reference labels below.
ResearchData <- test$Research
# Factor copy of the same column; not referenced again in the visible script.
ResearchDataFactor <- factor(test$Research)

# Logistic regression of Research on every other column of `test`
# (the `.` includes Serial.No. and Chance.of.Admit).
simlog<-glm(factor(Research)~., family = "binomial", data = test)
# Confusion table at a 0.5 probability cut-off: rows are "predicted > 0.5",
# columns are the observed Research value.
# NOTE(review): predictions are made on the same rows the model was fitted
# on, so this measures in-sample fit, not cross-validated performance.
table(predict(simlog, type = "response")>0.5, ResearchData)
##        ResearchData
##           0   1
##   FALSE 154  57
##   TRUE   66 223
# Misclassification rate = wrong predictions / ALL observations. It is computed
# from the model rather than from hand-copied table counts. The previous
# denominator (154 + 223) counted only the correct predictions; with the table
# above the rate is (57 + 66) / 500.
misclassificationRate <- mean(
  (predict(simlog, type = "response") > 0.5) != (ResearchData == 1)
)
capture.output(cat('Misclassification rate = ', misclassificationRate))
## [1] "Misclassification rate =  0.246"
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
# Classification metrics for the logistic model at a 0.5 cut-off.
# MLmetrics argument order differs per function: F1_Score() and Sensitivity()
# take (y_true, y_pred), Accuracy() takes (y_pred, y_true). Arguments are
# named so predictions and truth cannot be swapped. `positive = "1"` scores
# the Research = 1 class; the default would score the first level ("0").
predictedClass <- as.numeric(predict(simlog, type = "response") > 0.5)
F1 <- F1_Score(y_true = ResearchData, y_pred = predictedClass, positive = "1")
Accu <- Accuracy(y_pred = predictedClass, y_true = ResearchData)
Sens <- Sensitivity(
  y_true = ResearchData, y_pred = predictedClass, positive = "1"
)

# One-row summary table of the three scores.
scoreTable <- cbind(F1, Accu, Sens)
colnames(scoreTable) <- c("F1 Score", "Accuracy", "Sensitivity")
rownames(scoreTable) <- c("Logistic Regression")
#rownames(scoreTable)<-c("Logistic Regression", "Neural Network")
round(scoreTable, 3)
##                     F1 Score Accuracy Sensitivity
## Logistic Regression    0.784    0.754       0.796

CV for linear model - University Rating - Manual Leave-One-Out

# Leave-one-out cross-validation for
# University.Rating ~ TOEFL.Score + SOP + LOR + CGPA.
#
# Every predictor now leaves out the same row i (the original used `LOR[-1]`
# instead of `LOR[-i]`). The formula uses bare column names with `data =` so
# that predict() can match the held-out row in `newdata`; terms such as
# `SOP[-i]` cannot be matched, and summing the columns inside data.frame()
# yields one meaningless column.
set.seed(7861)  # kept from the original; LOOCV itself draws no random numbers

cvlm <- vector("list", nrow(test))  # fitted model for each left-out row
msecv <- numeric(nrow(test))        # squared prediction error per row
for (i in seq_len(nrow(test))) {
  # Fit the linear model without observation i
  cvlm[[i]] <- lm(
    University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
    data = test[-i, , drop = FALSE]
  )
  # Squared error of the prediction for the single held-out observation
  msecv[i] <- (
    predict(cvlm[[i]], newdata = test[i, , drop = FALSE]) -
      test$University.Rating[i]
  )^2
}
# LOOCV estimate of the mean squared error
mean(msecv)
## NOTE: re-knit to refresh this output. The previously recorded value,
## [1] 3.373402, was produced by the defective loop and is not a valid MSE.